---
title: "figure6-won-plot"
format: html
editor: visual
---

## Making Figure 6 (Won et al)

```{r}
library(devtools)
install.packages("lme4", type = "source")
install_github("naoki-egami/dsl", ref = "update-Aug2024", dependencies = TRUE, force = TRUE)
library(dsl)
library(janitor)
library(tidyverse)
library(broom)
library(jtools)
library(tidyverse)
library(ggplot2)
library(broom.mixed)
library(estimatr)
```

```{r}
won_machine_df <- read.csv(file = "~/Downloads/DSL images/WonetAl_result.csv")
won_human_df <- read.table(file="~/Downloads/DSL images/annot_test.txt", sep="\t", quote="", comment.char="")
# some basic cleaning
won_human_df <- as.data.frame(won_human_df)
won_human_df <- won_human_df %>% row_to_names(row_number = 1)
won_machine_df['protest_h'] <- won_human_df['protest']
won_protest_human <- subset(won_human_df,protest == 1)
won_protest_machine <- subset(won_machine_df,protest_h == 1)

won_protest_machine['sign'] <- won_protest_human['sign']
won_protest_machine['photo'] <- won_protest_human['photo']
won_protest_machine['fire'] <- won_protest_human['fire']
won_protest_machine['police'] <- won_protest_human['police']
won_protest_machine['children'] <- won_protest_human['children']
won_protest_machine['group_20'] <- won_protest_human['group_20']
won_protest_machine['flag'] <- won_protest_human['flag']
won_protest_machine['night'] <- won_protest_human['night']
won_protest_machine['shouting'] <- won_protest_human['shouting']

won_protest_machine$violence_truth <- won_protest_human$violence
```

```{r}
won_sampling <- function(won_protest_machine, N, seed){
  set.seed(seed)
  # Sample with replacement from the rows of the data frame
  sample_won <- won_protest_machine[sample(nrow(won_protest_machine), size = nrow(won_protest_machine), replace = TRUE), ]

  # DSL iterations
  sample_won$row = 1:nrow(sample_won)
  
  sub_sample_indices <- sample(nrow(sample_won), size = (nrow(sample_won) - N))
  sub_sample_won <- sample_won
  sub_sample_won$row <- seq_len(nrow(sub_sample_won))
  # Assign NA to our labeled observations 
  sub_sample_won$violence_truth[sub_sample_indices] <- NA
  return(sub_sample_won)
}
```

```{r}
# number of times we want to run this
k = 500

# set seed for replicability
set.seed(1678990064)
seeds <- sample(x=1:99999,size=k)


N = 600

multiple_runs_sign_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_photo_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_fire_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_police_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_children_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_group_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate","std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_flag_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate","std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_night_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
multiple_runs_shouting_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))

smultiple_runs_sign_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_photo_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_fire_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_police_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_children_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_group_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate","std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_flag_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate","std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_night_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
smultiple_runs_shouting_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))


rmultiple_runs_sign_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_photo_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_fire_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_police_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_children_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_group_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate","std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_flag_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate","std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_night_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))
rmultiple_runs_shouting_600 <- data.frame(matrix(nrow = 5,ncol = k), row.names = c("estimate", "std.error", "conf.low", "conf.high", "p.value"))



# DSL
for(i in 1:k){
  df <- won_sampling(won_protest_machine, N, seeds[i])
  dsl_model_i <- dsl(model = "lm",  formula = violence_truth ~ sign + photo + fire + police + children + group_20 + flag + night + shouting,   predicted_var = "violence_truth", prediction = "violence",   data = df, tuning = TRUE)
  d <- data.frame(summary(dsl_model_i))
  multiple_runs_sign_600[,i] <- c(d$Estimate[2], d$Std..Error[2], d$CI.Lower[2], d$CI.Upper[2], d$p.value[2])
  multiple_runs_photo_600[,i] <- c(d$Estimate[3], d$Std..Error[3], d$CI.Lower[3], d$CI.Upper[3], d$p.value[3])
  multiple_runs_fire_600[,i] <- c(d$Estimate[4], d$Std..Error[4], d$CI.Lower[4], d$CI.Upper[4], d$p.value[4])
  multiple_runs_police_600[,i] <-  c(d$Estimate[5], d$Std..Error[5], d$CI.Lower[5], d$CI.Upper[5], d$p.value[5])
  multiple_runs_children_600[,i]<-  c(d$Estimate[6], d$Std..Error[6], d$CI.Lower[6], d$CI.Upper[6], d$p.value[6])
  multiple_runs_group_600[,i] <-  c(d$Estimate[7], d$Std..Error[7], d$CI.Lower[7], d$CI.Upper[7], d$p.value[7])
  multiple_runs_flag_600[,i] <-  c(d$Estimate[8], d$Std..Error[8], d$CI.Lower[8], d$CI.Upper[8], d$p.value[8])
  multiple_runs_night_600[,i] <-  c(d$Estimate[9], d$Std..Error[9], d$CI.Lower[9], d$CI.Upper[9], d$p.value[9])
  multiple_runs_shouting_600[,i] <-  c(d$Estimate[10], d$Std..Error[10], d$CI.Lower[10], d$CI.Upper[10], d$p.value[10])
  dsl_model_i <- NA
  d <- NA
 
  sub_model_i <- lm_robust(formula = violence_truth ~ sign + photo + fire + police + children + group_20 + flag + night + shouting,   data = df)
  d<- sub_model_i %>% tidy(conf.int = TRUE)
  d<- as.data.frame(d)
  smultiple_runs_sign_600[,i] <- c(d$estimate[2], d$std.error[2], d$conf.low[2], d$conf.high[2], d$p.value[2])
  smultiple_runs_photo_600[,i] <- c(d$estimate[3], d$std.error[3], d$conf.low[3], d$conf.high[3], d$p.value[3])
  smultiple_runs_fire_600[,i] <- c(d$estimate[4], d$std.error[4], d$conf.low[4], d$conf.high[4], d$p.value[4])
  smultiple_runs_police_600[,i] <- c(d$estimate[5], d$std.error[5], d$conf.low[5], d$conf.high[5], d$p.value[5])
  smultiple_runs_children_600[,i] <- c(d$estimate[6], d$std.error[6], d$conf.low[6], d$conf.high[6], d$p.value[6])
  smultiple_runs_group_600[,i]<- c(d$estimate[7], d$std.error[7], d$conf.low[7], d$conf.high[7], d$p.value[7])
  smultiple_runs_flag_600[,i] <- c(d$estimate[8], d$std.error[8], d$conf.low[8], d$conf.high[8], d$p.value[8])
  smultiple_runs_night_600[,i] <- c(d$estimate[9], d$std.error[9], d$conf.low[9], d$conf.high[9], d$p.value[9])
  smultiple_runs_shouting_600[,i] <- c(d$estimate[10], d$std.error[10], d$conf.low[10], d$conf.high[10], d$p.value[10])
  
  # surrogate model
  sur_model_i <- lm_robust(formula = violence ~ sign + photo + fire + police + children + group_20 + flag + night + shouting,   data = df)
  d<- sur_model_i %>% tidy(conf.int = TRUE)
  d<- as.data.frame(d)
  rmultiple_runs_sign_600[,i] <- c(d$estimate[2], d$std.error[2], d$conf.low[2], d$conf.high[2], d$p.value[2])
  rmultiple_runs_photo_600[,i] <- c(d$estimate[3], d$std.error[3], d$conf.low[3], d$conf.high[3], d$p.value[3])
  rmultiple_runs_fire_600[,i] <- c(d$estimate[4], d$std.error[4], d$conf.low[4], d$conf.high[4], d$p.value[4])
  rmultiple_runs_police_600[,i] <- c(d$estimate[5], d$std.error[5], d$conf.low[5], d$conf.high[5], d$p.value[5])
  rmultiple_runs_children_600[,i] <- c(d$estimate[6], d$std.error[6], d$conf.low[6], d$conf.high[6], d$p.value[6])
  rmultiple_runs_group_600[,i]<- c(d$estimate[7], d$std.error[7], d$conf.low[7], d$conf.high[7], d$p.value[7])
  rmultiple_runs_flag_600[,i] <- c(d$estimate[8], d$std.error[8], d$conf.low[8], d$conf.high[8], d$p.value[8])
  rmultiple_runs_night_600[,i] <- c(d$estimate[9], d$std.error[9], d$conf.low[9], d$conf.high[9], d$p.value[9])
  rmultiple_runs_shouting_600[,i] <- c(d$estimate[10], d$std.error[10], d$conf.low[10], d$conf.high[10], d$p.value[10])
  sur_model_i <- NA
  d <- NA
  print(i)

}
```

```{r}

cdsl_sign_600 <- as.data.frame(t(multiple_runs_sign_600))
cdsl_photo_600 <- as.data.frame(t(multiple_runs_photo_600))
cdsl_fire_600 <- as.data.frame(t(multiple_runs_fire_600))
cdsl_police_600 <- as.data.frame(t(multiple_runs_police_600))
cdsl_children_600 <- as.data.frame(t(multiple_runs_children_600))
cdsl_group_600 <- as.data.frame(t(multiple_runs_group_600))
cdsl_flag_600 <- as.data.frame(t(multiple_runs_flag_600))
cdsl_night_600 <- as.data.frame(t(multiple_runs_night_600))
cdsl_shouting_600 <- as.data.frame(t(multiple_runs_shouting_600))

cdsl_sign_600 <- drop_na(cdsl_sign_600)
cdsl_photo_600 <- drop_na(cdsl_photo_600)
cdsl_fire_600 <- drop_na(cdsl_fire_600)
cdsl_police_600 <- drop_na(cdsl_police_600)
cdsl_children_600 <- drop_na(cdsl_children_600)
cdsl_group_600 <- drop_na(cdsl_group_600)
cdsl_flag_600 <- drop_na(cdsl_flag_600)
cdsl_night_600 <- drop_na(cdsl_night_600)
cdsl_shouting_600 <- drop_na(cdsl_shouting_600)

sdsl_sign_600 <- as.data.frame(t(smultiple_runs_sign_600))
sdsl_photo_600 <- as.data.frame(t(smultiple_runs_photo_600))
sdsl_fire_600 <- as.data.frame(t(smultiple_runs_fire_600))
sdsl_police_600 <- as.data.frame(t(smultiple_runs_police_600))
sdsl_children_600 <- as.data.frame(t(smultiple_runs_children_600))
sdsl_group_600 <- as.data.frame(t(smultiple_runs_group_600))
sdsl_flag_600 <- as.data.frame(t(smultiple_runs_flag_600))
sdsl_night_600 <- as.data.frame(t(smultiple_runs_night_600))
sdsl_shouting_600 <- as.data.frame(t(smultiple_runs_shouting_600))

sdsl_sign_600 <- drop_na(sdsl_sign_600)
sdsl_photo_600 <- drop_na(sdsl_photo_600)
sdsl_fire_600 <- drop_na(sdsl_fire_600)
sdsl_police_600 <- drop_na(sdsl_police_600)
sdsl_children_600 <- drop_na(sdsl_children_600)
sdsl_group_600 <- drop_na(sdsl_group_600)
sdsl_flag_600 <- drop_na(sdsl_flag_600)
sdsl_night_600 <- drop_na(sdsl_night_600)
sdsl_shouting_600 <- drop_na(sdsl_shouting_600)

won_protest_machine$violence_truth <- won_protest_human$violence
truth_model <- lm_robust(formula = violence_truth ~ sign + photo + fire + police + children + group_20 + flag + night + shouting,   data = won_protest_machine)

truth_tidy <- truth_model  %>% tidy(conf.int = TRUE)
t_sign <- truth_tidy$estimate[2]
t_photo <- truth_tidy$estimate[3]
t_fire <- truth_tidy$estimate[4]
t_police <- truth_tidy$estimate[5]
t_children <- truth_tidy$estimate[6]
t_group <- truth_tidy$estimate[7]
t_flag <- truth_tidy$estimate[8]
t_night <- truth_tidy$estimate[9]
t_shouting <- truth_tidy$estimate[10]

t_up_sign <- truth_tidy$conf.high[2]
t_up_photo <- truth_tidy$conf.high[3]
t_up_fire <- truth_tidy$conf.high[4]
t_up_police <- truth_tidy$conf.high[5]
t_up_children <- truth_tidy$conf.high[6]
t_up_group <- truth_tidy$conf.high[7]
t_up_flag <- truth_tidy$conf.high[8]
t_up_night <- truth_tidy$conf.high[9]
t_up_shouting <- truth_tidy$conf.high[10]

t_l_sign <- truth_tidy$conf.low[2]
t_l_photo <- truth_tidy$conf.low[3]
t_l_fire <- truth_tidy$conf.low[4]
t_l_police <- truth_tidy$conf.low[5]
t_l_children <- truth_tidy$conf.low[6]
t_l_group <- truth_tidy$conf.low[7]
t_l_flag <- truth_tidy$conf.low[8]
t_l_night <- truth_tidy$conf.low[9]
t_l_shouting <- truth_tidy$conf.low[10]
```

```{r}
# DSL
cdsl_photo_600["covered"] <- with(cdsl_photo_600, t_photo >= conf.low & t_photo <= conf.high)
cdsl_group_600["covered"] <- with(cdsl_group_600, t_group >= conf.low & t_group <= conf.high)
cdsl_night_600["covered"] <- with(cdsl_night_600, t_night >= conf.low & t_night <= conf.high)


photo_dsl_percent_600 <- sum(cdsl_photo_600["covered"], na.rm = TRUE)/nrow(cdsl_photo_600["covered"])
group_dsl_percent_600 <- sum(cdsl_group_600["covered"], na.rm = TRUE)/nrow(cdsl_group_600["covered"])
night_dsl_percent_600 <- sum(cdsl_night_600["covered"], na.rm = TRUE)/nrow(cdsl_night_600["covered"])


# ResNet

rdsl_photo_600 <- as.data.frame(t(rmultiple_runs_photo_600))
rdsl_group_600 <- as.data.frame(t(rmultiple_runs_group_600))
rdsl_night_600 <- as.data.frame(t(rmultiple_runs_night_600))


rdsl_photo_600["covered"] <- with(rdsl_photo_600, t_photo >= conf.low & t_photo <= conf.high)
rdsl_group_600["covered"] <- with(rdsl_group_600, t_group >= conf.low & t_group <= conf.high)
rdsl_night_600["covered"] <- with(rdsl_night_600, t_night >= conf.low & t_night <= conf.high)


photo_m_percent_600 <- sum(rdsl_photo_600["covered"], na.rm = TRUE)/nrow(rdsl_photo_600["covered"])
group_m_percent_600 <- sum(rdsl_group_600["covered"], na.rm = TRUE)/nrow(rdsl_group_600["covered"])
night_m_percent_600 <- sum(rdsl_night_600["covered"], na.rm = TRUE)/nrow(rdsl_night_600["covered"])



barplots <- data.frame(rows = c( "photo", "group", "night"), "DSL" = c( photo_dsl_percent_600,group_dsl_percent_600, night_dsl_percent_600), "ResNet (SO)" = c(photo_m_percent_600, group_m_percent_600,  night_m_percent_600))

df2 <- reshape::melt(barplots, id = c("rows"))
df2$variable <- factor(df2$variable, labels = c("DSL", "ResNet (SO)"))
won1 <- ggplot(data = df2, aes(x = rows, y = value)) + facet_wrap(~ variable) + geom_point(alpha = 0.7, size = 3) + labs(title = "Coverage of 95% confidence intervals", y = "Coverage percentage", x = "Independent variables") +theme(plot.title=element_text(face="bold")) + geom_hline(yintercept = 0.95, linetype="dashed", col = "blue") + theme_bw()  +  # Use grey color scale
  theme(
    axis.text.x = element_text(size = 14),  # Make x-axis labels larger
    axis.text.y = element_text(size = 14),  # Adjust y-axis labels size
    axis.title = element_text(size = 14),  # Adjust axis titles
    plot.title = element_text(size = 15),  # Adjust plot title size
    legend.text = element_text(size = 14),  # Adjust legend text size
    strip.text = element_text(size = 14)  # Make facet label text larger
  )
```

## Figure 6

```{r}
machine_tidy <- data.frame("term" = c( "photo", "group>20", "night"), "estimate" = c( mean(rdsl_photo_600$estimate),mean(rdsl_group_600$estimate),  mean(rdsl_night_600$estimate)), "std.error" = c(mean(rdsl_photo_600$std.error), mean(rdsl_group_600$std.error),  mean(rdsl_night_600$std.error)), "p.value" = c( mean(rdsl_photo_600$p.value), mean(rdsl_group_600$p.value),  mean(rdsl_night_600$p.value)), "conf.low" = c( mean(rdsl_photo_600$conf.low),  mean(rdsl_group_600$conf.low), mean(rdsl_night_600$conf.low)), "conf.high" = c( mean(rdsl_photo_600$conf.high),  mean(rdsl_group_600$conf.high), mean(rdsl_night_600$conf.high)))

dsl_tidy <- data.frame("term" = c("photo",  "group>20","night"), "estimate" = c(mean(cdsl_photo_600$estimate), mean(cdsl_group_600$estimate), mean(cdsl_night_600$estimate)), "std.error" = c(mean(cdsl_photo_600$std.error), mean(cdsl_group_600$std.error),  mean(cdsl_night_600$std.error)), "p.value" = c(mean(cdsl_photo_600$p.value),mean(cdsl_group_600$p.value),  mean(cdsl_night_600$p.value)), "conf.low" = c(mean(cdsl_photo_600$conf.low), mean(cdsl_group_600$conf.low), mean(cdsl_night_600$conf.low)), "conf.high" = c(mean(cdsl_photo_600$conf.high),mean(cdsl_group_600$conf.high),  mean(cdsl_night_600$conf.high)))
truth_tidy <- truth_tidy[,c("estimate", "std.error", "p.value", "conf.low", "conf.high")]
truth_tidy$term <- c("Intercept", "sign", "photo", "fire", "police", "children", "group>20", "flag", "night", "shouting")


```

```{r}
truth_tidy <- truth_tidy %>% subset(term %in% c("photo","group>20", "night"))
alltogether <- rbind(dsl_tidy, machine_tidy, truth_tidy)
alltogether$model <- c("DSL", "DSL", "DSL", "ResNet (SO)", "ResNet (SO)", "ResNet (SO)", "Benchmark", "Benchmark", "Benchmark")

truth_baselines <- data.frame(
  term = c("photo","group>20", "night"),
  truth_value = c(t_photo, t_group, t_night)
)

alltogether$model <- factor(alltogether$model, levels = c("DSL", "ResNet (SO)", "Benchmark"))

# Plot with term-specific baselines
won2 <- alltogether %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(estimate, model, shape = model))  + facet_wrap(~ term) +
  scale_shape_manual(values = 0:9) + 
  geom_point(show.legend = FALSE) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), show.legend = FALSE) +
  # Add variable-specific dotted lines
  geom_vline(data = truth_baselines, aes(xintercept = truth_value), lty = 2) +
  coord_cartesian(xlim = c(-0.1, 0.2)) +
  labs(x = "Estimate",
       y = NULL,
       title = "Estimates and confidence intervals for predictors of perceived level of violence") +
  theme_bw() +
  scale_color_grey() +  # Use grey color scale
  theme(
    axis.text.x = element_text(size = 14),  # Make x-axis labels larger
    axis.text.y = element_text(size = 14),  # Adjust y-axis labels size
    axis.title = element_text(size = 14),  # Adjust axis titles
    plot.title = element_text(size = 15),  # Adjust plot title size
    legend.text = element_text(size = 14),  # Adjust legend text size
    strip.text = element_text(size = 14)  # Make facet label text larger
  )
```

```{r}
require(gridExtra)
```
